{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget http://knowitall.cs.washington.edu/oqa/data/wikianswers/part-00002.gz\n",
    "# !wget http://knowitall.cs.washington.edu/oqa/data/wikianswers/part-00003.gz\n",
    "# !gzip -d part-00002.gz\n",
    "# !gzip -d part-00003.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6369606"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "data = []\n",
    "for f in glob('/home/husein/wikianswers/part-*'):\n",
    "    with open(f) as fopen:\n",
    "        for l in fopen:\n",
    "            data.extend([l_[2:].strip() for l_ in l.split('\\t')][:3])\n",
    "            if len(data) >= 10000000:\n",
    "                break\n",
    "                \n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [d for d in data if len(d) and len(d.split()) < 1024]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('wikianswers.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6369123 wikianswers.txt\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l wikianswers.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "import json\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = datasets.load_dataset(\n",
    "            'c4',\n",
    "            'en',\n",
    "            streaming=True,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [06:46, 2462.57it/s]\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "with open('english-c4.jsonl', 'w') as fopen:\n",
    "    for d in tqdm(dataset['train']):\n",
    "        t = d['text']\n",
    "        fopen.write(f'{json.dumps(t)}\\n')\n",
    "        i += 1\n",
    "        if i >= 1e6:\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "english_news = glob('/home/husein/google-translate-english-news/*.requested')\n",
    "len(english_news)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "39"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "english_texts = glob('/home/husein/google-translate-english-texts/*.requested')\n",
    "len(english_texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malay_news = glob('/home/husein/google-translate-malay-news/*.requested')\n",
    "len(malay_news)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hansard = glob('/home/husein/google-translate-malaysia-hansard/*.requested')\n",
    "len(hansard)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "ens = [english_news, english_texts]\n",
    "mss = [malay_news, hansard]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "fopen_ms = open('prepare-english-ms.jsonl', 'w')\n",
    "fopen_en = open('prepare-english-en.jsonl', 'w')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "18837it [00:00, 78483.75it/s]\n",
      "100000it [00:00, 320103.30it/s]\n",
      "100000it [00:00, 313774.17it/s]\n",
      "100000it [00:00, 316199.25it/s]\n",
      "100000it [00:00, 319323.21it/s]\n",
      "100000it [00:00, 312872.57it/s]\n",
      "100000it [00:00, 317276.07it/s]\n",
      "100000it [00:00, 324782.33it/s]\n",
      "100000it [00:00, 315380.17it/s]\n",
      "100000it [00:00, 315094.68it/s]\n",
      "100000it [00:00, 323319.98it/s]\n",
      "100000it [00:00, 314908.49it/s]\n",
      "100000it [00:00, 310526.79it/s]\n",
      "100000it [00:00, 329834.23it/s]\n",
      "100000it [00:00, 320797.88it/s]\n",
      "100000it [00:00, 318976.43it/s]\n",
      "100000it [00:00, 330386.59it/s]\n",
      "100000it [00:00, 309486.54it/s]\n",
      "100000it [00:00, 312360.66it/s]\n",
      "100000it [00:00, 315523.95it/s]\n",
      "100000it [00:00, 319765.31it/s]\n",
      "100000it [00:00, 317384.35it/s]\n",
      "100000it [00:00, 322634.05it/s]\n",
      "100000it [00:00, 321988.11it/s]\n",
      "100000it [00:00, 321600.49it/s]\n",
      "100000it [00:00, 317498.47it/s]\n",
      "100000it [00:00, 323534.46it/s]\n",
      "100000it [00:00, 317301.27it/s]\n",
      "100000it [00:00, 315565.25it/s]\n",
      "100000it [00:00, 323572.40it/s]\n",
      "99999it [00:00, 316339.65it/s]\n",
      "100000it [00:00, 312975.76it/s]\n",
      "99990it [00:00, 262732.58it/s]\n",
      "99997it [00:00, 259242.66it/s]\n",
      "99997it [00:00, 260473.50it/s]\n",
      "99996it [00:00, 263162.97it/s]\n",
      "99998it [00:00, 236403.57it/s]\n",
      "99993it [00:00, 264497.36it/s]\n",
      "7616it [00:00, 262669.96it/s]\n",
      "99995it [00:00, 258223.65it/s]\n",
      "99995it [00:00, 259134.49it/s]\n",
      "99997it [00:00, 258681.76it/s]\n",
      "99992it [00:00, 257760.27it/s]\n",
      "99994it [00:00, 261255.68it/s]\n",
      "99993it [00:00, 257367.72it/s]\n",
      "99988it [00:00, 255555.00it/s]\n",
      "99993it [00:00, 264089.49it/s]\n",
      "99999it [00:00, 255397.92it/s]\n",
      "99992it [00:00, 262182.35it/s]\n",
      "99994it [00:00, 258836.04it/s]\n",
      "99996it [00:00, 257913.09it/s]\n",
      "99999it [00:00, 260824.22it/s]\n",
      "99994it [00:00, 253324.30it/s]\n",
      "99996it [00:00, 258361.60it/s]\n",
      "99993it [00:00, 254997.36it/s]\n",
      "99995it [00:00, 258115.58it/s]\n",
      "99994it [00:00, 261641.13it/s]\n",
      "99990it [00:00, 257231.20it/s]\n",
      "99996it [00:00, 257697.26it/s]\n",
      "99993it [00:00, 264373.15it/s]\n",
      "99996it [00:00, 254774.67it/s]\n",
      "99991it [00:00, 262448.30it/s]\n",
      "99996it [00:00, 259097.86it/s]\n",
      "99995it [00:00, 256692.70it/s]\n",
      "99997it [00:00, 267104.28it/s]\n",
      "99992it [00:00, 254598.37it/s]\n",
      "99994it [00:00, 261642.60it/s]\n",
      "99998it [00:00, 262723.81it/s]\n",
      "99992it [00:00, 258879.59it/s]\n",
      "99999it [00:00, 265930.73it/s]\n",
      "99996it [00:00, 260479.31it/s]\n",
      "87609it [00:00, 315569.93it/s]\n",
      "100000it [00:00, 305045.39it/s]\n",
      "100000it [00:00, 305757.43it/s]\n",
      "100000it [00:00, 297596.06it/s]\n",
      "99897it [00:00, 312427.03it/s]\n",
      "99999it [00:00, 302601.46it/s]\n",
      "99992it [00:00, 303628.47it/s]\n",
      "99994it [00:00, 338747.98it/s]\n",
      "99997it [00:00, 302049.09it/s]\n",
      "99998it [00:00, 303313.57it/s]\n",
      "100000it [00:00, 300718.47it/s]\n",
      "99989it [00:00, 303043.46it/s]\n",
      "100000it [00:00, 305761.22it/s]\n",
      "99994it [00:00, 310599.91it/s]\n",
      "100000it [00:00, 300090.22it/s]\n",
      "100000it [00:00, 299666.78it/s]\n",
      "100000it [00:00, 303198.61it/s]\n",
      "100000it [00:00, 304387.90it/s]\n",
      "100000it [00:00, 313112.21it/s]\n",
      "100000it [00:00, 299170.24it/s]\n",
      "100000it [00:00, 307592.07it/s]\n",
      "99995it [00:00, 317218.93it/s]\n",
      "43098it [00:01, 28224.45it/s]\n",
      "36329it [00:01, 28258.90it/s]\n",
      "42876it [00:01, 28525.27it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "for fs in ens:\n",
    "    for f in fs:\n",
    "        with open(f) as fopen:\n",
    "            for l in tqdm(fopen):\n",
    "                data = json.loads(l)\n",
    "                en = data['src']\n",
    "                ms = data['r']['result']\n",
    "                if len(en):\n",
    "                    fopen_en.write(f'{json.dumps(en)}\\n')\n",
    "                if len(ms):\n",
    "                    fopen_ms.write(f'{json.dumps(ms)}\\n')\n",
    "                    \n",
    "for fs in mss:\n",
    "    for f in fs:\n",
    "        with open(f) as fopen:\n",
    "            for l in tqdm(fopen):\n",
    "                data = json.loads(l)\n",
    "                if isinstance(data['src'], dict):\n",
    "                    s = data['src']['cleaned']\n",
    "                else:\n",
    "                    s = data['src']\n",
    "                ms = s\n",
    "                en = data['r']['result']\n",
    "                \n",
    "                if len(en):\n",
    "                    fopen_en.write(f'{json.dumps(en)}\\n')\n",
    "                if len(ms):\n",
    "                    fopen_ms.write(f'{json.dumps(ms)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "fopen_ms.close()\n",
    "fopen_ms.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
